library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /Users/marierivers/Documents/UCSB_Environmental_Data_Science/EDS_222_Statistics_for_Environmental_Data_Science/EDS_222_Final_Project
library(skimr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(stringr)
library(ggplot2)
library(patchwork)
library(gghighlight)
library(paletteer)
library(ggExtra)
library(ggrepel)
library(ggbeeswarm)
library(sf)
## Linking to GEOS 3.8.1, GDAL 3.2.1, PROJ 7.2.1
library(urbnmapr)
library(tidycensus)
library(jtools)
library(ggstance)
## 
## Attaching package: 'ggstance'
## The following objects are masked from 'package:ggplot2':
## 
##     geom_errorbarh, GeomErrorbarh
library(huxtable)
## 
## Attaching package: 'huxtable'
## The following object is masked from 'package:kableExtra':
## 
##     add_footnote
## The following object is masked from 'package:GGally':
## 
##     wrap
## The following object is masked from 'package:dplyr':
## 
##     add_rownames
## The following object is masked from 'package:ggplot2':
## 
##     theme_grey
#library(tigris)

Read in EQI data and select columns of interest

api_txt <- "~/Documents/UCSB_Environmental_Data_Science/EDS_222_Statistics_for_Environmental_Data_Science/Final_Project_Notes/census_key.txt"

api_key <- readLines(api_txt)
census_api_key(api_key)
## To install your API key for use in future sessions, run this function with `install = TRUE`.
fips_codes <- data.frame(fips_codes) %>%
  rename(county_name = county) %>%
  mutate(stfips = paste0(state_code, county_code))

mutate(pop_change_text = case_when( pop_change_pct > 0 ~ “positive”, pop_change_pct <= 0 ~ “negative”)) # Read in county level population data

header <- c("county_name", "april_1_2000", "2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "april_1_2010", "july_1_2010")

codes <- c("01", "02", "04", "05", "06", "08", "09", "10", "11", "12", "13", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36", "37", "38", "39", "40", "41", "42", "44", "45", "46", "47", "48", "49", "50", "51", "53", "54", "55", "56")
source(here("src", "state_pop_fun.R"))
county_pop = data.frame()

for (i in seq_along(codes)) {
state <- state_pop_fun(fips = codes[i])
state_df <- data.frame(state)
county_pop <- rbind(county_pop, state_df)
}
county_pop <- county_pop %>% 
  left_join(fips_codes, by = c("county_name", "state_code")) %>%
  mutate(pop_change_pct = ((july_1_2010 - X2006) / X2006) * 100) %>% 
  mutate(pop_change_text = case_when(
    pop_change_pct > 0 ~ "positive",
    pop_change_pct <= 0 ~ "negative"))
county_pop_histogram <- ggplot(data = county_pop, aes(x = pop_change_pct)) +
  geom_histogram(fill = "red", bins = 200)
county_pop_histogram

Join eqi and population data

eqi_pop <- left_join(eqi, county_pop, by = c("stfips")) %>% 
  #relocate(c(stfips, state_code, county_code, state, state_name), .before = county_name) %>% 
  select(-april_1_2000, -X2000, -X2001, -X2002, -X2003, -X2004, -X2005, -X2007, -X2008, -X2009, -july_1_2010, -april_1_2010, -county_name.y, -county_name, -state_code.y, -county_code.y, -state.y, -state_name.y) %>%
  filter(!X2006 == "null") %>%
  rename(EQI = EQI_22July2013) %>%
  rename(air_EQI = air_EQI_22July2013) %>%
  rename(water_EQI = water_EQI_22July2013) %>%
  rename(land_EQI = land_EQI_22July2013) %>%
  rename(built_EQI = built_EQI_22July2013) %>%
  rename(sociodemographic_EQI = sociod_EQI_22July2013) %>%
  rename(state_code = state_code.x) %>%
  rename(state_name = state_name.x) %>%
  rename(state = state.x) %>%
  rename(county_name = county_name.x) %>%
  rename(county_code = county_code.x) %>%
  select(-X2006)
eqi_pop_no_outlier <- eqi_pop %>%
  filter(pop_change_pct < 100)
# xxx
write_csv(eqi_pop, here("data", "eqi_pop.csv"))

Summary Statistics and Graphs

eqi_pop_change_plot_scatter <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) + 
  geom_point(aes(color = pop_change_text)) +
  theme(legend.position = "bottom") +
  labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "Population change:")

eqi_pop_change_plot <- ggMarginal(eqi_pop_change_plot_scatter, type = "boxplot", groupColour = TRUE)
eqi_pop_change_plot

eqi_pop_rucc_scatter <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) + 
  geom_point(aes(color = cat_rucc), size = .75) +
  theme(legend.position = "bottom") +
  labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "RUCC:")

eqi_pop_rucc_plot <- ggMarginal(eqi_pop_rucc_scatter, type = "boxplot", groupColour = TRUE)
eqi_pop_rucc_plot

eqi_pop_rucc_scatter2 <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) + 
  geom_point(aes(color = rucc_text), size = .75) +
  theme(legend.position = "bottom") +
  labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "RUCC:")

eqi_pop_rucc_plot2 <- ggMarginal(eqi_pop_rucc_scatter2, type = "boxplot", groupColour = TRUE)
eqi_pop_rucc_plot2

eqi_pop_rucc_scatter2_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = EQI, y = pop_change_pct)) + 
  geom_point(aes(color = rucc_text), size = .75) +
  theme(legend.position = "bottom") +
  labs(title = "County percent population change vs. EQI, 2006-2010", x = "EQI", y = " % population change", color = "RUCC:")

eqi_pop_rucc_plot2_no_outlier <- ggMarginal(eqi_pop_rucc_scatter2_no_outlier, type = "boxplot", groupColour = TRUE)
eqi_pop_rucc_plot2_no_outlier

county_pop_rucc_histogram <- ggplot(data = eqi_pop, aes(x = pop_change_pct)) +
  geom_histogram(aes(fill = cat_rucc), position = "dodge", bins = 50)
county_pop_rucc_histogram

county_pop_rucc_histogram2 <- ggplot(data = eqi_pop, aes(x = pop_change_pct)) +
  geom_histogram(aes(fill = rucc_text), position = "dodge", bins = 50)
county_pop_rucc_histogram2

pop_boxplot <- ggplot(data = eqi_pop, aes(x = cat_rucc, y = pop_change_pct)) +
  geom_boxplot(aes(fill = cat_rucc), show.legend = FALSE) +
  coord_flip()
pop_boxplot

pop_boxplot2 <- ggplot(data = eqi_pop, aes(x = pop_change_pct)) +
  geom_boxplot() +
  labs(title = "U.S. county level population change, 2006-2010", x = "percent population change") +
  theme(axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank())
pop_boxplot2

eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = EQI, y = pop_change_pct)) + 
  geom_point()
air_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = air_EQI, y = pop_change_pct)) + 
  geom_point()
water_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = water_EQI, y = pop_change_pct)) + 
  geom_point()
land_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = land_EQI, y = pop_change_pct)) + 
  geom_point()
sociod_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = sociod_EQI, y = pop_change_pct)) + 
  geom_point()
built_eqi_pop_change_plot <- ggplot(data = eqi_pop, aes(x = built_EQI, y = pop_change_pct)) + 
  geom_point()
p1 <- land_eqi_pop_change_plot / built_eqi_pop_change_plot
p2 <- (air_eqi_pop_change_plot | water_eqi_pop_change_plot | land_eqi_pop_change_plot) + plot_layout(widths = 1)
row1 <- (eqi_pop_change_plot | p1) + plot_layout(widths = c(2, 1))

all_eqi_plot <- (row1 / p2) + plot_layout(heights = c(3, 1))
all_eqi_plot

Patchwork with no outlier

eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = EQI, y = pop_change_pct)) + 
  geom_point() + 
  labs(y = "% pop change") + 
  theme(axis.text.y = element_text(size = 6),
        axis.text.x = element_text(size = 6))

air_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = air_EQI, y = pop_change_pct)) + 
  geom_point() + 
  labs(y = "% pop change") + 
  theme(axis.text.y = element_text(size = 6),
        axis.text.x = element_text(size = 6))

water_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = water_EQI, y = pop_change_pct)) + 
  geom_point() + 
  labs(y = "% pop change") + 
  theme(axis.text.y = element_text(size = 6),
        axis.text.x = element_text(size = 6))

land_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = land_EQI, y = pop_change_pct)) + 
  geom_point() + 
  labs(y = "% pop change") + 
  theme(axis.text.y = element_text(size = 6),
        axis.text.x = element_text(size = 6))

sociod_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = sociodemographic_EQI, y = pop_change_pct)) + 
  geom_point() + 
  labs(y = "% pop change") + 
  theme(axis.text.y = element_text(size = 6),
        axis.text.x = element_text(size = 6))

built_eqi_pop_change_plot_no_outlier <- ggplot(data = eqi_pop_no_outlier, aes(x = built_EQI, y = pop_change_pct)) + 
  geom_point() + 
  labs(y = "% pop change") + 
  theme(axis.text.y = element_text(size = 6),
        axis.text.x = element_text(size = 6))
p1_no_outlier <- land_eqi_pop_change_plot_no_outlier / built_eqi_pop_change_plot_no_outlier
p2_no_outlier <- (air_eqi_pop_change_plot_no_outlier | water_eqi_pop_change_plot_no_outlier | sociod_eqi_pop_change_plot_no_outlier) + plot_layout(widths = 1)
row1_no_outlier <- (eqi_pop_change_plot_no_outlier | p1_no_outlier) + plot_layout(widths = c(2, 1))

all_eqi_plot_no_outlier <- (row1_no_outlier / p2_no_outlier) + plot_layout(heights = c(3, 1))
all_eqi_plot_no_outlier

histogram <- ggplot(data = eqi_pop, aes(x = EQI)) +
  geom_histogram(bins = 100)
histogram

histogram_pop_change <- ggplot(data = eqi_pop, aes(x = EQI)) +
  geom_histogram(aes(fill = pop_change_text), position = "dodge", bins = 50) +
  labs(title = "Historgram of EQI based on county population change, 2006-2010", x = "EQI", fill = "Population change")
histogram_pop_change

histogram_eqi_rucc <- ggplot(data = eqi_pop, aes(x = EQI)) +
  geom_histogram(aes(fill = rucc_text), position = "dodge", bins = 50) +
  labs(title = "Historgram of EQI based on RUCC", x = "EQI", fill = "RUCC")
histogram_eqi_rucc

bin_scatter <-ggplot(data=eqi_pop, aes(x=EQI, y = pop_change_pct)) + 
  geom_bin2d(bins=50) + theme_bw() + geom_hline(yintercept=0, color="red")
bin_scatter

Statistical Evaluation

Differnt means test

null_hypothesis: There is no difference in mean EQI for counties with positive and negative population change. \[H_{0}: \mu_{posPopChange} - \mu_{negPopChange} = 0\] alternative hypothesis: There is a difference in EQI for counties with positive and negative population change. \[H_{A}: \mu_{posPopChange} - \mu_{negPopChange} \neq 0\]

The standard error for a difference in means is defined as: \(SE = \sqrt{\frac{s_1^2}{n_1} + \frac{s^2_2}{n_2}}\) and the test-statistic for a hypothesis test is \(z = \frac{\text{point estimate - null}}{SE}\)

eqi_summary_table <- eqi_pop %>%
  group_by(pop_change_text) %>%
  summarise(min_eqi = round(min(EQI),2),
            max_eqi = round(max(EQI),2),
            mean_eqi = round(mean(EQI),2),
            sd_eqi = round(sd(EQI), 2),
            var_eqi = round(var(EQI), 2),
            count = n()) %>%
  bind_rows(., eqi_pop %>%
              summarise(min_eqi = round(min(EQI),2),
                        max_eqi = round(max(EQI),2),
                        mean_eqi = round(mean(EQI),2),
                        sd_eqi = round(sd(EQI), 2),
                        var_eqi = round(var(EQI), 2),
                        count = n()) %>%
              mutate(pop_change_text = "all counties")
            ) %>%
  kable(col.names = c("Population Change", "Minimum", "Maximum", "Mean", "Standard Deviation", "Variance", "Number of Counties"), caption = "EQI summary statistics for counties based on population change between 2006 and 2010", color = 'black') %>%
  kable_paper(full_width = FALSE) %>%
  column_spec(1, bold = T) %>%
  row_spec(0, bold = T, color = 'black') %>%
  row_spec(3, bold = T, background = '#e5e5e5')
eqi_summary_table  
EQI summary statistics for counties based on population change between 2006 and 2010
Population Change Minimum Maximum Mean Standard Deviation Variance Number of Counties
negative -5.88 2.59 -0.24 0.93 0.87 1088
positive -5.05 2.85 0.13 1.01 1.02 2052
all counties -5.88 2.85 0.00 1.00 1.00 3140
# mean values
mu_eqi_pos_pop_change <- eqi_pop %>% 
  filter(pop_change_pct > 0) %>% 
  summarise(mean_eqi = mean(EQI))

mu_eqi_neg_pop_change <- eqi_pop %>% 
  filter(pop_change_pct <= 0) %>% 
  summarise(mean_eqi = mean(EQI))

# standard deviations  
sd_eqi_pos_pop_change <- eqi_pop %>% 
  filter(pop_change_pct > 0) %>% 
  summarise(sd_eqi = sd(EQI))

sd_eqi_neg_pop_change <- eqi_pop %>% 
  filter(pop_change_pct <= 0) %>% 
  summarise(sd_eqi = sd(EQI))

# count of observations 
n_pos_pop_change <- eqi_pop %>% 
  filter(pop_change_pct > 0) %>% 
  summarise(count = n())

n_neg_pop_change <- eqi_pop %>% 
  filter(pop_change_pct <= 0) %>% 
  summarise(count = n())
print(sd_eqi_neg_pop_change)
## # A tibble: 1 × 1
##   sd_eqi
##    <dbl>
## 1  0.933
# calculate point estimate
point_est <- as.numeric(mu_eqi_pos_pop_change - mu_eqi_neg_pop_change)
print(point_est)
## [1] 0.375554

To calculate a standard error for a difference in means:

n = number of observations for each group s = standard deviation for each group \[SE = \sqrt{\frac{s_1^2}{n_1} + \frac{s^2_2}{n_2}}\]

# calculate standard error
SE <- as.numeric(sqrt( (sd_eqi_pos_pop_change^2 / n_pos_pop_change) + (sd_eqi_neg_pop_change^2 / n_neg_pop_change) ))

The definition of the z-score for hypothesis testing:

\[z_{score}=\frac{\text { point estimate }-\text { null value }}{S E}\]

z_score <- (point_est - 0) / SE

Calculated p-value: the probability of getting a point estimate at least as extreme as calculates above if the null hypothesis were true: \[p \text { - value }=\operatorname{Pr}(Z<-|z| \text { or } Z>|z|)=2 * \operatorname{Pr}(Z>|z|)\]

# Make use of the function `pnorm()` to access the normal distribution. Note: `pnorm()` gives you the probability mass below a certain cutoff in a probability distribution with a mean and standard deviation you can control. Use `lower.tail=FALSE` to get the mass above a given cutoff.
p_val = 2 * pnorm(point_est, mean = 0, sd = SE, lower.tail=FALSE)
print(p_val)
## [1] 1.80126e-25

Since \(p-value = 0.018 < 0.05\) we reject the null that there is no difference in the EQI of counties with positive population change versus negative population change. There is a statistically significant difference (at the 5% significance level) in EQI across the two population change groups.

Linear Model

Build a model for EQI vs population change and a model with coefficients for each EQI category to identify the category most correlated with population change

Map

county_shp <- st_read("data/gz_2010_us_050_00_20m/gz_2010_us_050_00_20m.shp")
## Reading layer `gz_2010_us_050_00_20m' from data source 
##   `/Users/marierivers/Documents/UCSB_Environmental_Data_Science/EDS_222_Statistics_for_Environmental_Data_Science/EDS_222_Final_Project/data/gz_2010_us_050_00_20m/gz_2010_us_050_00_20m.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3221 features and 6 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.1473 ymin: 17.88481 xmax: 179.7785 ymax: 71.35256
## Geodetic CRS:  NAD83

project filter only 50 states + DC try to find a shapefile that makes alaska and hawaii look good

# https://urbaninstitute.github.io/urbnmapr/
# counties_sf <- get_urbn_map("counties", sf = TRUE)
# 
# counties_sf %>% 
#   ggplot(aes()) +
#   geom_sf(fill = "grey", color = "#ffffff")
county_map_df <- county_laea %>% 
  rename(stfips = GEOID) %>% 
  right_join(eqi_pop)
## Joining, by = "stfips"
## old-style crs object detected; please recreate object with a recent sf::st_crs()

No space in blog post for maps!

library(RColorBrewer)
pal <- brewer.pal(7, "OrRd")

plot(county_map_df["EQI"],
     main = "Environmental Quality Index by County, 2006-2010",
     breaks = "quantile", 
     nbreaks = 7, 
     pal = pal)

# county_map_df2 %>% 
#   ggplot(aes()) +
#   geom_sf(aes(fill = pop_change_pct), color = "black", size = .5)

Relationship between domain specific EQI and population change

Linear regression models were created for each domain specific EQI to determine if a particular domain was more strongly correlated with population change.

eqi_model <- lm(pop_change_pct ~ EQI, data = eqi_pop)
summary(eqi_model)
## 
## Call:
## lm(formula = pop_change_pct ~ EQI, data = eqi_pop)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.528  -2.839  -0.710   2.034 120.039 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.90890    0.08731   21.86   <2e-16 ***
## EQI          0.99440    0.08739   11.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.892 on 3138 degrees of freedom
## Multiple R-squared:  0.03962,    Adjusted R-squared:  0.03932 
## F-statistic: 129.5 on 1 and 3138 DF,  p-value: < 2.2e-16
eqi_rucc_model <- lm(pop_change_pct ~ EQI + rucc_text, data = eqi_pop)
summary(eqi_rucc_model)
## 
## Call:
## lm(formula = pop_change_pct ~ EQI + rucc_text, data = eqi_pop)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.302  -2.683  -0.658   1.916 118.553 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              0.69269    0.12295   5.634 1.92e-08 ***
## EQI                      0.30978    0.09859   3.142  0.00169 ** 
## rucc_textmetro or urban  2.70580    0.19800  13.665  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.754 on 3137 degrees of freedom
## Multiple R-squared:  0.09358,    Adjusted R-squared:  0.093 
## F-statistic: 161.9 on 2 and 3137 DF,  p-value: < 2.2e-16
air_eqi_model <- lm(pop_change_pct ~ air_EQI, data = eqi_pop)
water_eqi_model <- lm(pop_change_pct ~ water_EQI, data = eqi_pop)
land_eqi_model <- lm(pop_change_pct ~ land_EQI, data = eqi_pop)
built_eqi_model <- lm(pop_change_pct ~ built_EQI, data = eqi_pop)
sociodemographic_eqi_model <- lm(pop_change_pct ~ sociodemographic_EQI, data = eqi_pop)
plot_summs(eqi_model, air_eqi_model, water_eqi_model, land_eqi_model, built_eqi_model, sociodemographic_eqi_model, inner_ci_level = 0.9, model.names = c("EQI", "air EQI", "water EQI", "land EQI", "built EQI", "sociodem EQI"))
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed
## Loading required namespace: broom.mixed

export_summs(eqi_model, air_eqi_model, water_eqi_model, land_eqi_model, built_eqi_model, sociodemographic_eqi_model, 
             digits = 3, error_format = "[{conf.low}, {conf.high}]", 
             model.names = c("EQI", "air EQI", "water EQI", "land EQI", "built EQI", "sociodem EQI"))
EQIair EQIwater EQIland EQIbuilt EQIsociodem EQI
(Intercept)1.909 ***1.910 ***1.909 ***1.910 ***1.908 ***1.910 ***
[1.738, 2.080]   [1.739, 2.081]   [1.735, 2.083]   [1.735, 2.084]   [1.735, 2.082]   [1.739, 2.080]   
EQI0.994 ***                                        
[0.823, 1.166]                                           
air_EQI        1.038 ***                                
        [0.867, 1.209]                                   
water_EQI                0.432 ***                        
                [0.258, 0.606]                           
land_EQI                        -0.307 ***                
                        [-0.481, -0.132]                   
built_EQI                                0.641 ***        
                                [0.467, 0.815]           
sociodemographic_EQI                                        1.055 ***
                                        [0.885, 1.226]   
N3140        3140        3140        3140        3140        3140        
R20.040    0.043    0.007    0.004    0.016    0.045    
*** p < 0.001; ** p < 0.01; * p < 0.05.